1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.solr.internal.csv;
18
19 import java.io.IOException;
20 import java.io.Reader;
21 import java.io.InputStreamReader;
22 import java.io.InputStream;
23 import java.util.ArrayList;
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52 public class CSVParser {
53
54
55 private static final int INITIAL_TOKEN_LENGTH = 50;
56
57
58
59 protected static final int TT_INVALID = -1;
60
61 protected static final int TT_TOKEN = 0;
62
63 protected static final int TT_EOF = 1;
64
65 protected static final int TT_EORECORD = 2;
66
67
68 private static final String[] EMPTY_STRING_ARRAY = new String[0];
69
70
71 private final ExtendedBufferedReader in;
72
73 private final CSVStrategy strategy;
74
75
76
77 private final ArrayList record = new ArrayList();
78 private final Token reusableToken = new Token();
79 private final CharBuffer wsBuf = new CharBuffer();
80 private final CharBuffer code = new CharBuffer(4);
81
82
83
84
85
86
87
88 static class Token {
89
90 int type = TT_INVALID;
91
92 CharBuffer content = new CharBuffer(INITIAL_TOKEN_LENGTH);
93
94 boolean isReady;
95
96 Token reset() {
97 content.clear();
98 type = TT_INVALID;
99 isReady = false;
100 return this;
101 }
102 }
103
104
105
106
107
108
109
110
111
112
/**
 * Creates a parser that splits values on a comma; all other settings are
 * those chosen by {@link #CSVParser(Reader, char)}.
 *
 * @param input reader supplying the CSV text
 */
public CSVParser(Reader input) {
  this(
      input,
      ',');
}
117
118
119
120
121
122
123
124
125
126
127
/**
 * Creates a parser with a caller-chosen delimiter, a double quote as the
 * encapsulator and {@code CSVStrategy.COMMENTS_DISABLED} as the comment
 * start character.
 *
 * @param input     reader supplying the CSV text
 * @param delimiter character separating two values
 */
public CSVParser(Reader input, char delimiter) {
  this(
      input,
      delimiter,
      '"',
      CSVStrategy.COMMENTS_DISABLED);
}
131
132
133
134
135
136
137
138
139
140
141
142
143
144
/**
 * Creates a parser whose strategy is built from the three given characters
 * via {@code new CSVStrategy(delimiter, encapsulator, commentStart)}.
 *
 * @param input        reader supplying the CSV text
 * @param delimiter    character separating two values
 * @param encapsulator character enclosing a quoted value
 * @param commentStart character that starts a comment line
 */
public CSVParser(Reader input, char delimiter, char encapsulator, char commentStart) {
  this(
      input,
      new CSVStrategy(delimiter, encapsulator, commentStart));
}
148
149
150
151
152
153
154
/**
 * Creates a parser that reads from {@code input} using the given strategy.
 * The reader is wrapped in an {@code ExtendedBufferedReader}.
 *
 * @param input    reader supplying the CSV text
 * @param strategy parsing configuration to apply
 */
public CSVParser(Reader input, CSVStrategy strategy) {
  this.strategy = strategy;
  this.in = new ExtendedBufferedReader(input);
}
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175 public String[][] getAllValues() throws IOException {
176 ArrayList records = new ArrayList();
177 String[] values;
178 String[][] ret = null;
179 while ((values = getLine()) != null) {
180 records.add(values);
181 }
182 if (records.size() > 0) {
183 ret = new String[records.size()][];
184 records.toArray(ret);
185 }
186 return ret;
187 }
188
189
190
191
192
193
194
195
196 public String nextValue() throws IOException {
197 Token tkn = nextToken();
198 String ret = null;
199 switch (tkn.type) {
200 case TT_TOKEN:
201 case TT_EORECORD:
202 ret = tkn.content.toString();
203 break;
204 case TT_EOF:
205 ret = null;
206 break;
207 case TT_INVALID:
208 default:
209
210 throw new IOException(
211 "(line " + getLineNumber()
212 + ") invalid parse sequence");
213
214 }
215 return ret;
216 }
217
218
219
220
221
222
223
224
225
226 public String[] getLine() throws IOException {
227 String[] ret = EMPTY_STRING_ARRAY;
228 record.clear();
229 while (true) {
230 reusableToken.reset();
231 nextToken(reusableToken);
232 switch (reusableToken.type) {
233 case TT_TOKEN:
234 record.add(reusableToken.content.toString());
235 break;
236 case TT_EORECORD:
237 record.add(reusableToken.content.toString());
238 break;
239 case TT_EOF:
240 if (reusableToken.isReady) {
241 record.add(reusableToken.content.toString());
242 } else {
243 ret = null;
244 }
245 break;
246 case TT_INVALID:
247 default:
248
249 throw new IOException("(line " + getLineNumber() + ") invalid parse sequence");
250
251 }
252 if (reusableToken.type != TT_TOKEN) {
253 break;
254 }
255 }
256 if (!record.isEmpty()) {
257 ret = (String[]) record.toArray(new String[record.size()]);
258 }
259 return ret;
260 }
261
262
263
264
265
266
267
268
269
270 public int getLineNumber() {
271 return in.getLineNumber();
272 }
273
274
275
276
277
278
279
280
281 protected Token nextToken() throws IOException {
282 return nextToken(new Token());
283 }
284
285
286
287
288
289
290
291
292
293
294
295
296 protected Token nextToken(Token tkn) throws IOException {
297 wsBuf.clear();
298
299
300 int lastChar = in.readAgain();
301
302
303
304
305
306
307
308 int c = in.read();
309 boolean eol = isEndOfLine(c);
310 c = in.readAgain();
311
312
313 while (strategy.getIgnoreEmptyLines() && eol
314 && (lastChar == '\n'
315 || lastChar == ExtendedBufferedReader.UNDEFINED)
316 && !isEndOfFile(lastChar)) {
317
318 lastChar = c;
319 c = in.read();
320 eol = isEndOfLine(c);
321 c = in.readAgain();
322
323 if (isEndOfFile(c)) {
324 tkn.type = TT_EOF;
325 return tkn;
326 }
327 }
328
329
330 if (isEndOfFile(lastChar) || (lastChar != strategy.getDelimiter() && isEndOfFile(c))) {
331 tkn.type = TT_EOF;
332 return tkn;
333 }
334
335
336 while (!tkn.isReady && tkn.type != TT_EOF) {
337
338 while (strategy.getIgnoreLeadingWhitespaces() && isWhitespace(c) && !eol) {
339 wsBuf.append((char) c);
340 c = in.read();
341 eol = isEndOfLine(c);
342 }
343
344 if (c == strategy.getCommentStart()) {
345
346 in.readLine();
347 tkn = nextToken(tkn.reset());
348 } else if (c == strategy.getDelimiter()) {
349
350 tkn.type = TT_TOKEN;
351 tkn.isReady = true;
352 } else if (eol) {
353
354
355 tkn.type = TT_EORECORD;
356 tkn.isReady = true;
357 } else if (c == strategy.getEncapsulator()) {
358
359 encapsulatedTokenLexer(tkn, c);
360 } else if (isEndOfFile(c)) {
361
362
363 tkn.type = TT_EOF;
364 tkn.isReady = true;
365 } else {
366
367
368 if (!strategy.getIgnoreLeadingWhitespaces()) {
369 tkn.content.append(wsBuf);
370 }
371 simpleTokenLexer(tkn, c);
372 }
373 }
374 return tkn;
375 }
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395 private Token simpleTokenLexer(Token tkn, int c) throws IOException {
396 for (;;) {
397 if (isEndOfLine(c)) {
398
399 tkn.type = TT_EORECORD;
400 tkn.isReady = true;
401 break;
402 } else if (isEndOfFile(c)) {
403
404 tkn.type = TT_EOF;
405 tkn.isReady = true;
406 break;
407 } else if (c == strategy.getDelimiter()) {
408
409 tkn.type = TT_TOKEN;
410 tkn.isReady = true;
411 break;
412 } else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
413
414 tkn.content.append((char) unicodeEscapeLexer(c));
415 } else if (c == strategy.getEscape()) {
416 tkn.content.append((char)readEscape(c));
417 } else {
418 tkn.content.append((char) c);
419 }
420
421 c = in.read();
422 }
423
424 if (strategy.getIgnoreTrailingWhitespaces()) {
425 tkn.content.trimTrailingWhitespace();
426 }
427
428 return tkn;
429 }
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445 private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
446
447 int startLineNumber = getLineNumber();
448
449
450 for (;;) {
451 c = in.read();
452
453 if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead()=='u') {
454 tkn.content.append((char) unicodeEscapeLexer(c));
455 } else if (c == strategy.getEscape()) {
456 tkn.content.append((char)readEscape(c));
457 } else if (c == strategy.getEncapsulator()) {
458 if (in.lookAhead() == strategy.getEncapsulator()) {
459
460 c = in.read();
461 tkn.content.append((char) c);
462 } else {
463
464 for (;;) {
465 c = in.read();
466 if (c == strategy.getDelimiter()) {
467 tkn.type = TT_TOKEN;
468 tkn.isReady = true;
469 return tkn;
470 } else if (isEndOfFile(c)) {
471 tkn.type = TT_EOF;
472 tkn.isReady = true;
473 return tkn;
474 } else if (isEndOfLine(c)) {
475
476 tkn.type = TT_EORECORD;
477 tkn.isReady = true;
478 return tkn;
479 } else if (!isWhitespace(c)) {
480
481 throw new IOException(
482 "(line " + getLineNumber()
483 + ") invalid char between encapsulated token end delimiter"
484 );
485 }
486 }
487 }
488 } else if (isEndOfFile(c)) {
489
490 throw new IOException(
491 "(startline " + startLineNumber + ")"
492 + "eof reached before encapsulated token finished"
493 );
494 } else {
495
496 tkn.content.append((char) c);
497 }
498 }
499 }
500
501
502
503
504
505
506
507
508
509
510
511 protected int unicodeEscapeLexer(int c) throws IOException {
512 int ret = 0;
513
514 c = in.read();
515 code.clear();
516 try {
517 for (int i = 0; i < 4; i++) {
518 c = in.read();
519 if (isEndOfFile(c) || isEndOfLine(c)) {
520 throw new NumberFormatException("number too short");
521 }
522 code.append((char) c);
523 }
524 ret = Integer.parseInt(code.toString(), 16);
525 } catch (NumberFormatException e) {
526 throw new IOException(
527 "(line " + getLineNumber() + ") Wrong unicode escape sequence found '"
528 + code.toString() + "'" + e.toString());
529 }
530 return ret;
531 }
532
533 private int readEscape(int c) throws IOException {
534
535 c = in.read();
536 int out;
537 switch (c) {
538 case 'r': out='\r'; break;
539 case 'n': out='\n'; break;
540 case 't': out='\t'; break;
541 case 'b': out='\b'; break;
542 case 'f': out='\f'; break;
543 default : out=c;
544 }
545 return out;
546 }
547
548
549
550
551
552
553
554
555
556
557 public CSVStrategy getStrategy() {
558 return this.strategy;
559 }
560
561
562
563
564
565
566
567
568 private boolean isWhitespace(int c) {
569 return Character.isWhitespace((char) c) && (c != strategy.getDelimiter());
570 }
571
572
573
574
575
576
577
578 private boolean isEndOfLine(int c) throws IOException {
579
580 if (c == '\r') {
581 if (in.lookAhead() == '\n') {
582
583 c = in.read();
584 }
585 }
586 return (c == '\n');
587 }
588
589
590
591
592 private boolean isEndOfFile(int c) {
593 return c == ExtendedBufferedReader.END_OF_STREAM;
594 }
595 }